Running mapping and QC pipeline

In [1]:
#add pipeline to path
import sys
import datetime
#the path to the ribosome profilign pipeline code
pipeline_dir = '/home/boris/analysis_pipelines/ribo_seq/'
if pipeline_dir not in sys.path:
    sys.path.append(pipeline_dir)

import ribo_seq_main
import ribo_settings
import ribo_utils
import ribo_lib
import ribo_qc
import ribo_plotting
import ribo_tables

def current_time():
    return datetime.datetime.now().strftime("%Y%m%d_%H%M")
def current_date():
    return datetime.datetime.now().strftime("%Y%m%d")
In [2]:
## Now import the pickled read count objects
import os
import os.path as path
#note - my code is memory inefficcient, as our server has 1TB of ram, probably 64 or 128 would be sufficient
#perform_qc() code is pretty poorly written and not multithreaded, so it takes hours to run for large datasets
'''
can also launch from command line if this doesn't wok, which is reccommended to get usefeul error messages
eg: python /path to/ribo_seq/ribo_seq_main.py set1_settings.json.txt --threads 48 --perform-qc

if pipeline runs successfuly it won't repeat all calculations, but will just load the mapped read counts positions.
'''

#as many cores as you're willing to use:
threads = 48

set1_settings = ribo_settings.ribo_settings('settings_files/set1_settings.json.txt')
set1_experiment = ribo_seq_main.experiment(set1_settings, threads)
set1_experiment.make_tables()
set1_experiment.perform_qc()
set2_settings = ribo_settings.ribo_settings('settings_files/set2_settings.json.txt')
set2_experiment = ribo_seq_main.experiment(set2_settings, threads)
set2_experiment.make_tables()
set2_experiment.perform_qc()
set3_settings = ribo_settings.ribo_settings('settings_files/set3_settings.json.txt')
set3_experiment = ribo_seq_main.experiment(set3_settings, threads)
set3_experiment.make_tables()
set3_experiment.perform_qc()
set4_settings = ribo_settings.ribo_settings('settings_files/set4_settings.json.txt')
set4_experiment = ribo_seq_main.experiment(set4_settings, threads)
set4_experiment.make_tables()
set4_experiment.perform_qc()
unpickling 1A_noDep counts
unpickling 2A_noDep counts
unpickling 1B_riboZero counts
unpickling 2B_riboZero counts
unpickling 1C_riboCop counts
unpickling 2C_riboCop counts
unpickling 1D_NEB counts
unpickling 2D_NEB counts
1A_noDep
total:  617919
primary:  176532
unique mapping:  114743
multiply mapping:  61789
secondary:  441387
unmapped:  0
2A_noDep
total:  760586
primary:  222882
unique mapping:  146575
multiply mapping:  76307
secondary:  537704
unmapped:  0
1B_riboZero
total:  1670276
primary:  502665
unique mapping:  332229
multiply mapping:  170436
secondary:  1167611
unmapped:  0
2B_riboZero
total:  2035629
primary:  605551
unique mapping:  399364
multiply mapping:  206187
secondary:  1430078
unmapped:  0
1C_riboCop
total:  850067
primary:  232051
unique mapping:  149289
multiply mapping:  82762
secondary:  618016
unmapped:  0
2C_riboCop
total:  1494809
primary:  428142
unique mapping:  279442
multiply mapping:  148700
secondary:  1066667
unmapped:  0
1D_NEB
total:  680648
primary:  200166
unique mapping:  124551
multiply mapping:  75615
secondary:  480482
unmapped:  0
2D_NEB
total:  870618
primary:  288130
unique mapping:  184449
multiply mapping:  103681
secondary:  582488
unmapped:  0
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
/home/boris/analysis_pipelines/ribo_seq/ribo_qc.py:98: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  read_summary = pd.concat(dfs)
unpickling NEBNext_20_34 counts
unpickling undepleted_20_34 counts
unpickling NEBNext_20_60 counts
unpickling undepleted_20_60 counts
NEBNext_20_34
total:  1815797
primary:  1017548
unique mapping:  811771
multiply mapping:  205777
secondary:  798249
unmapped:  0
undepleted_20_34
total:  520648
primary:  233779
unique mapping:  162154
multiply mapping:  71625
secondary:  286869
unmapped:  0
NEBNext_20_60
total:  783234
primary:  363023
unique mapping:  267949
multiply mapping:  95074
secondary:  420211
unmapped:  0
undepleted_20_60
total:  130228
primary:  59352
unique mapping:  41360
multiply mapping:  17992
secondary:  70876
unmapped:  0
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
unpickling undepleted_1 counts
unpickling undepleted_2 counts
unpickling ribozero_p counts
unpickling ribozero_p_o counts
unpickling ribozero_p_f counts
unpickling ribozero_p_of counts
unpickling qiagen_fastselect_1 counts
unpickling qiagen_fastselect_2 counts
unpickling ribopools_1 counts
unpickling ribopools_2 counts
unpickling ribopools_no_heat_1 counts
unpickling ribopools_no_heat_2 counts
undepleted_1
total:  3702719
primary:  1476440
unique mapping:  1005527
multiply mapping:  470913
secondary:  2226279
unmapped:  0
undepleted_2
total:  3330089
primary:  1339964
unique mapping:  913441
multiply mapping:  426523
secondary:  1990125
unmapped:  0
ribozero_p
total:  8877186
primary:  5299216
unique mapping:  4462054
multiply mapping:  837162
secondary:  3577970
unmapped:  0
ribozero_p_o
total:  1937470
primary:  1513939
unique mapping:  1397646
multiply mapping:  116293
secondary:  423531
unmapped:  0
ribozero_p_f
total:  1673530
primary:  1389105
unique mapping:  1291848
multiply mapping:  97257
secondary:  284425
unmapped:  0
ribozero_p_of
total:  4049969
primary:  3621560
unique mapping:  3471366
multiply mapping:  150194
secondary:  428409
unmapped:  0
qiagen_fastselect_1
total:  8172884
primary:  3208240
unique mapping:  2164611
multiply mapping:  1043629
secondary:  4964644
unmapped:  0
qiagen_fastselect_2
total:  9629310
primary:  3737855
unique mapping:  2521068
multiply mapping:  1216787
secondary:  5891455
unmapped:  0
ribopools_1
total:  7302788
primary:  2913573
unique mapping:  1985027
multiply mapping:  928546
secondary:  4389215
unmapped:  0
ribopools_2
total:  9387039
primary:  3749713
unique mapping:  2550321
multiply mapping:  1199392
secondary:  5637326
unmapped:  0
ribopools_no_heat_1
total:  17632787
primary:  7269406
unique mapping:  4968576
multiply mapping:  2300830
secondary:  10363381
unmapped:  0
ribopools_no_heat_2
total:  18222836
primary:  7387893
unique mapping:  5031257
multiply mapping:  2356636
secondary:  10834943
unmapped:  0
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
unpickling undepleted counts
unpickling oligo_depletion counts
undepleted
total:  5053519
primary:  1331622
unique mapping:  838317
multiply mapping:  493305
secondary:  3721897
unmapped:  0
oligo_depletion
total:  26685062
primary:  8188407
unique mapping:  5368748
multiply mapping:  2819659
secondary:  18496655
unmapped:  0
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3666: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)
/home/boris/.local/lib/python2.7/site-packages/seaborn/categorical.py:3672: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

Figures 1B,D,F and 4A: stacked bar plots of mapping fractions

In [3]:
import pandas as pd

summary_set1 = pd.read_csv(os.path.join(set1_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0',  'reads processed'])
summary_set2 = pd.read_csv(os.path.join(set2_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0', 'reads processed'])
summary_set3 = pd.read_csv(os.path.join(set3_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0',  'reads processed'])
summary_set4 = pd.read_csv(os.path.join(set4_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0',  'reads processed'])
In [4]:
#code for condensing dataframes
singular_categories = ['sample', '<10 nt', 'rRNA_mapping']
UTR3 = ['multiple mapping_3p_UTR','unique mapping_3p_UTR']
UTR5 = ['multiple mapping_5p_UTR','unique mapping_5p_UTR']
CDS = ['multiple mapping_CDS', 'unique mapping_CDS',]
intron = [ 'unique mapping_sense_intronic','unique mapping_retained_intron', 'unique mapping_protein_coding', 'unique mapping_nonsense_mediated_decay', 'multiple mapping_nonsense_mediated_decay','multiple mapping_protein_coding', 'multiple mapping_retained_intron','multiple mapping_sense_intronic']

combined_cats = singular_categories + UTR3 + UTR5 + CDS + intron

other = ['unique mapping_sense_overlapping', 'unique mapping_snoRNA', 'unique mapping_transcribed_processed_pseudogene', 'unique mapping_transcribed_unitary_pseudogene', 'unique mapping_transcribed_unprocessed_pseudogene', 'unique mapping_translated_processed_pseudogene', 'unique mapping_unitary_pseudogene', 'unique mapping_unprocessed_pseudogene','unique mapping_processed_transcript', 'unique mapping_ribozyme', 'unique mapping_sRNA', 'unique mapping_scaRNA', 'unique mapping_non_stop_decay',  'unique mapping_not annotated', 'unique mapping_processed_pseudogene', 'unique mapping_bidirectional_promoter_lncRNA', 'multiple mapping_non_stop_decay', 'unique mapping_lincRNA', 'unique mapping_miRNA', 'unique mapping_misc_RNA', 'unique mapping_TEC', 'unique mapping_TR_V_pseudogene', 'unique mapping_antisense_RNA','snRNA_mapping', 'tRNA_mapping', 'multiple mapping_transcribed_unprocessed_pseudogene', 'multiple mapping_translated_processed_pseudogene', 'multiple mapping_unitary_pseudogene', 'multiple mapping_unprocessed_pseudogene', 'multiple mapping_ribozyme', 'multiple mapping_scaRNA', 'multiple mapping_sense_overlapping', 'multiple mapping_snoRNA', 'multiple mapping_transcribed_processed_pseudogene', 'multiple mapping_transcribed_unitary_pseudogene','multiple mapping_not annotated', 'multiple mapping_processed_pseudogene', 'multiple mapping_processed_transcript','multiple mapping_IG_C_pseudogene', 'multiple mapping_TEC', 'multiple mapping_antisense_RNA', 'multiple mapping_bidirectional_promoter_lncRNA', 'multiple mapping_lincRNA', 'multiple mapping_miRNA', 'multiple mapping_misc_RNA',]

def condense_df(summary_df):
    ##collapses read categories and converts to percent
    ## discards PhiX reads from calculation
    condensed_df = pd.DataFrame()

    for cat in singular_categories:
        condensed_df[cat] = summary_df[cat]

    condensed_df['UTR'] = summary_df[UTR5+UTR3].sum(axis=1)
    condensed_df['CDS'] = summary_df[CDS].sum(axis=1)
    condensed_df['intron'] = summary_df[intron].sum(axis=1)
    condensed_df['other'] = summary_df[list(set(summary_df.columns).difference(combined_cats+['NC_001422.1_phi-X174', 'input reads']))].sum(axis=1)
    adjusted_total = summary_df['input reads']-summary_df['NC_001422.1_phi-X174']
    for column in condensed_df.columns:
        if not column == 'sample':
            condensed_df[column] = 100.*condensed_df[column].div(adjusted_total, axis=0)
    
    condensed_df['unmapped'] = 100.-condensed_df.sum(axis=1)
    condensed_df = condensed_df.rename(columns={'rRNA_mapping': 'rRNA'})
    
    return condensed_df
In [5]:
condensed_df_set1 = condense_df(summary_set1)
condensed_df_set2 = condense_df(summary_set2)
condensed_df_set3 = condense_df(summary_set3)
condensed_df_set4 = condense_df(summary_set4)
In [6]:
#code for stacked bar plots
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines

def plot_summary_bar(summary_df, out_name, order=['rRNA', '<10 nt', 'unmapped', 'other', 'intron', 'UTR', 'CDS'], sample_order=None):
    if not sample_order==None:
        summary_df = summary_df.reindex(sample_order)
    ind = np.arange(len(summary_df))  # the x locations for the groups
    width = 0.8  # the width of the bars: can also be len(x) sequence
    plotLayers = []
    bottoms = [0] * len(summary_df)
    bottoms = np.array(bottoms)

    fig = plt.figure(figsize=(len(summary_df), 4))
    num_plots_wide = 1
    num_plots_high = 1
    plot = fig.add_subplot(num_plots_high, num_plots_wide, 1)
    
    black = (0,0,0)
    gray = (127/255.0,127/255.0,127/255.0)
    orange = (230/255.0,159/255.0,0)
    skyBlue = (86/255.0,180/255.0,233/255.0)
    bluishGreen = (0,158/255.0,115/255.0)
    yellow = (240/255.0,228/255.0,66/255.0)
    blue = (0,114/255.0,178/255.0)
    vermillion = (213/255.0,94/255.0,0)
    reddishPurple = (204/255.0,121/255.0,167/255.0)
    colors = [gray, orange, skyBlue, bluishGreen, blue, reddishPurple, vermillion, yellow]
    color_index = 0
    for anno_type in order:
        plotLayers.append(plot.bar(ind, summary_df[anno_type].values, width, bottom=bottoms, color=colors[color_index], label=anno_type, hatch=None))
        color_index += 1
        bottoms = bottoms + np.array(summary_df[anno_type].values)

    plt.ylabel('percent of reads')
    #plt.title('summary of read loss in pipeline')
    plot.set_xticks(ind)
    plot.set_xticklabels(summary_df['sample'].values, rotation=85)
    plt.tight_layout()
    # Shrink current axis by 40%
    box = plot.get_position()
    plot.set_position([box.x0, box.y0, box.width * 0.6, box.height])
    # Put a legend to the right of the current axis
    plot.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plot.set_ylim(0, 100)
    plot.spines['right'].set_visible(False)
    plot.spines['top'].set_visible(False)
    # plt.subplots_adjust(bottom=0.38, right=0.8)
    plt.savefig(out_name, transparent=True)
In [7]:
plot_summary_bar(condensed_df_set1, 'outputs/figure1B.pdf')
In [8]:
plot_summary_bar(condensed_df_set2, 'outputs/figure1D.pdf', sample_order = [1, 3, 0, 2])
In [9]:
plot_summary_bar(condensed_df_set3, 'outputs/figure2F.pdf')
In [10]:
plot_summary_bar(condensed_df_set4, 'outputs/figure4A.pdf')

Figures 1C,E,G and 4B: read length distributions

In [11]:
#collect frag lengths
import pandas as pd
import numpy as np

def get_frag_lengths(experiment):
    dfs = []
    for lib in experiment.libs:
        frag_dict = lib.get_all_CDS_fragment_length_counts()
        frag_lengths = sorted(frag_dict.keys())
        frag_length_counts = [frag_dict[length] for length in frag_lengths]
        d = {'fragment length':frag_lengths, '# reads':frag_length_counts, '% reads':100.*np.array(frag_length_counts)/sum(frag_length_counts), 'sample':[lib.lib_settings.sample_name]*len(frag_length_counts)}
        temp_df = pd.DataFrame(data=d)
        dfs.append(temp_df)
    return pd.concat(dfs)

set1_frag_lengths = get_frag_lengths(set1_experiment)
set2_frag_lengths = get_frag_lengths(set2_experiment)
set3_frag_lengths = get_frag_lengths(set3_experiment)
set4_frag_lengths = get_frag_lengths(set4_experiment)

set1_frag_lengths.to_csv('outputs/set1_frag_lengths.tsv', sep='\t')
set2_frag_lengths.to_csv('outputs/set2_frag_lengths.tsv', sep='\t')
set3_frag_lengths.to_csv('outputs/set3_frag_lengths.tsv', sep='\t')
set4_frag_lengths.to_csv('outputs/set4_frag_lengths.tsv', sep='\t')
In [12]:
set1_frag_lengths=pd.read_csv('outputs/set1_frag_lengths.tsv', sep='\t')
set2_frag_lengths=pd.read_csv('outputs/set2_frag_lengths.tsv', sep='\t')
set3_frag_lengths=pd.read_csv('outputs/set3_frag_lengths.tsv', sep='\t')
set4_frag_lengths=pd.read_csv('outputs/set4_frag_lengths.tsv', sep='\t')
In [13]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines

def plot_length_dist(frag_length_df, outname, datasets=None):
    if datasets == None:
        datasets = frag_length_df['sample'].unique()
    
    end = '5p'
    fig = plt.figure(figsize=(5, 5))
    plots = []
    
    plot = fig.add_subplot(111)
    color_index = 0
    group_df=frag_length_df.groupby(['sample'])
    for sample in datasets:
        df = group_df.get_group(sample)
        #df_5p.sort_values(by=['position'], inplace=True)
        df.plot(x='fragment length', y='% reads', ax=plot, color = ribo_utils.rainbow[color_index%len(ribo_utils.rainbow)], 
                linestyle=ribo_utils.line_styles[color_index/len(ribo_utils.rainbow)], lw=2, sharex=True, sharey=True, label = sample)
        color_index += 1
        #plot.set_title('%s %smer' % (sample, str(fragment_length)), y=0.8, loc='left')
    plots.append(plot)
    major_xticks=range(12, 60, 3)
    #minor_xticks=plot.get_xticks()[::3]
    #major_tick_labels=plot.get_xticklabels()[::6]
    #plot.set_xticks(minor_xticks, minor=True)
    plot.set_xticks(major_xticks, minor=False)
    #plot.set_xticklabels(major_tick_labels)
    plot.set_ylabel('% CDS-mapping reads', fontsize=20)
    plot.set_xlabel('fragment length', fontsize=20)
    # Hide the right and top spines
    plot.spines['right'].set_visible(False)
    plot.spines['top'].set_visible(False)
    plot.set_xlim(15, 38)
    plot.set_ylim(0,25)
    plt.savefig(outname, transparent=True)
In [14]:
plot_length_dist(set1_frag_lengths, 'outputs/figure1C.pdf', datasets = None)
In [15]:
plot_length_dist(set2_frag_lengths, 'outputs/figure1E.pdf', datasets = None)
In [16]:
plot_length_dist(set3_frag_lengths, 'outputs/figure1G.pdf', datasets = ['undepleted_1', 'undepleted_2', 'ribozero_p', 'ribozero_p_o', 'ribozero_p_f', 'ribozero_p_of' ])
In [17]:
plot_length_dist(set4_frag_lengths, 'outputs/figure4B.pdf', datasets = None)

Figure 2 and 4D: compare CDS counts

In [21]:
#get CDS-mapping footprint counts from pipeline output
set1_cds_counts = pd.read_csv(os.path.join(set1_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set2_cds_counts = pd.read_csv(os.path.join(set2_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set3_cds_counts = pd.read_csv(os.path.join(set3_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set4_cds_counts = pd.read_csv(os.path.join(set4_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')

set1_cds_counts.to_csv('outputs/set1_CDS_counts.tsv', sep='\t', index=False)
set2_cds_counts.to_csv('outputs/set2_CDS_counts.tsv', sep='\t', index=False)
set3_cds_counts.to_csv('outputs/set3_CDS_counts.tsv', sep='\t', index=False)
set4_cds_counts.to_csv('outputs/set4_CDS_counts.tsv', sep='\t', index=False)
In [22]:
def get_rpm_df(df):
    rpm_df = pd.DataFrame()
    rpm_df[['tx_id', 'gene_name']] = df[['tx_id', 'gene_name']]
    for column in df.columns:
        if column not in ['tx_id', 'gene_name']:
            rpm_df[column] = df[column]/(sum(df[column])/10.**6)
    return rpm_df
In [25]:
#convert counts to reads per million
set1_cds_rpms = get_rpm_df(set1_cds_counts)
set2_cds_rpms = get_rpm_df(set2_cds_counts)
set3_cds_rpms = get_rpm_df(set3_cds_counts)
set4_cds_rpms = get_rpm_df(set4_cds_counts)
set1_cds_rpms.to_csv('outputs/set1_CDS_rpms.tsv', sep='\t', index=False)
set2_cds_rpms.to_csv('outputs/set2_CDS_rpms.tsv', sep='\t', index=False)
set3_cds_rpms.to_csv('outputs/set3_CDS_rpms.tsv', sep='\t', index=False)
set4_cds_rpms.to_csv('outputs/set4_CDS_rpms.tsv', sep='\t', index=False)
In [26]:
#filter for read counts of at least an average of 64 per library, and 1 read per million in at least 1 dataset
set2_cds_counts = set2_cds_counts[set2_cds_counts.mean(axis=1)>=64]
set3_cds_counts = set3_cds_counts[set3_cds_counts.mean(axis=1)>=64]
set4_cds_counts = set4_cds_counts[set4_cds_counts.mean(axis=1)>=64]

set2_cds_rpms = set2_cds_rpms[set2_cds_rpms['tx_id'].isin(set2_cds_counts['tx_id'])]
set3_cds_rpms = set3_cds_rpms[set3_cds_rpms['tx_id'].isin(set3_cds_counts['tx_id'])]
set4_cds_rpms = set4_cds_rpms[set4_cds_rpms['tx_id'].isin(set4_cds_counts['tx_id'])]

set2_cds_rpms = set2_cds_rpms[set2_cds_rpms.min(axis=1)>=1]
set3_cds_rpms = set3_cds_rpms[set3_cds_rpms.min(axis=1)>=1]
set4_cds_rpms = set4_cds_rpms[set4_cds_rpms.min(axis=1)>=1]
In [27]:
#plot read counts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from adjustText import adjust_text
import math

plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
##plot CDF or mRNA RPKMs
black = (0,0,0)
gray = (0.5,0.5,0.5)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [black, orange, skyBlue, bluishGreen, vermillion, blue, reddishPurple, yellow]
rainbow = [black, vermillion, orange, bluishGreen, blue, reddishPurple, 'violet']

df = set2_cds_rpms
pairs = [('undepleted_20_34', 'undepleted_20_60'),  ('undepleted_20_34', 'NEBNext_20_34'), ('undepleted_20_60', 'NEBNext_20_60') , ('NEBNext_20_34','NEBNext_20_60')]

plot_index=1
num_plots_wide=len(pairs)
num_plots_high=1
fig = plt.figure(figsize=(2.5*num_plots_wide, 2.5*num_plots_high))
plot_index = 1

for pair in pairs:
    plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index)
    df.plot.scatter(pair[0], pair[1], color=black, s=2, alpha=0.5, ax=plot)
    pearson_r, pearson_p = stats.pearsonr(np.log10(df[pair[0]]), np.log10(df[pair[1]]))
    plot.text(50, 15000, 'r2=%.4f (%d)' % (pearson_r**2, len(df[pair[0]])))
    plot.set_ylim(10, 20000)
    plot.set_xlim(10, 20000)
    plot.spines['right'].set_visible(False)
    plot.spines['top'].set_visible(False)
    plot_index+=1
    plot.set_xscale('log')
    plot.set_yscale('log')
    #plot.set_xticks([10**x for x in range(-2, 2)])
    #plot.set_yticks([10**x for x in range(-2, 2)])
    plot.yaxis.set_minor_locator(plt.NullLocator())
    plot.xaxis.set_minor_locator(plt.NullLocator())
    plot.set_aspect('equal', 'box')
#lg=plt.legend(loc=4,prop={'size':12}, labelspacing=0.2)
#lg.draw_frame(False)
plt.tight_layout()
plt.savefig('outputs/figure2A.pdf', transparent='True', format='pdf')
In [28]:
#plot read counts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from adjustText import adjust_text
import math

plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
##plot CDF or mRNA RPKMs
black = (0,0,0)
gray = (0.5,0.5,0.5)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [black, orange, skyBlue, bluishGreen, vermillion, blue, reddishPurple, yellow]
rainbow = [black, vermillion, orange, bluishGreen, blue, reddishPurple, 'violet']

df = set3_cds_rpms
pairs = [('undepleted_1', 'undepleted_2'),  ('undepleted_1', 'ribozero_p'), ('undepleted_1', 'ribozero_p_o') , 
         ('undepleted_1', 'ribozero_p_f'), ('undepleted_1', 'ribozero_p_of')]

plot_index=1
num_plots_wide=len(pairs)
num_plots_high=1
fig = plt.figure(figsize=(2.5*num_plots_wide, 2.5*num_plots_high))
plot_index = 1

for pair in pairs:
    plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index)
    df.plot.scatter(pair[0], pair[1], color=black, s=2, alpha=0.5, ax=plot)
    pearson_r, pearson_p = stats.pearsonr(np.log10(df[pair[0]]), np.log10(df[pair[1]]))
    plot.text(5, 15000, 'r2=%.4f (%d)' % (pearson_r**2, len(df[pair[0]])))
    plot.set_ylim(1, 50000)
    plot.set_xlim(1, 50000)
    plot.spines['right'].set_visible(False)
    plot.spines['top'].set_visible(False)
    plot_index+=1
    plot.set_xscale('log')
    plot.set_yscale('log')
    #plot.set_xticks([10**x for x in range(-2, 2)])
    #plot.set_yticks([10**x for x in range(-2, 2)])
    plot.yaxis.set_minor_locator(plt.NullLocator())
    plot.xaxis.set_minor_locator(plt.NullLocator())
    plot.set_aspect('equal', 'box')

#lg=plt.legend(loc=4,prop={'size':12}, labelspacing=0.2)
#lg.draw_frame(False)
plt.tight_layout()
plt.savefig('outputs/figure2B.pdf', transparent='True', format='pdf')
In [29]:
#plot read counts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from adjustText import adjust_text
import math

plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
##plot CDF or mRNA RPKMs
black = (0,0,0)
gray = (0.5,0.5,0.5)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [black, orange, skyBlue, bluishGreen, vermillion, blue, reddishPurple, yellow]
rainbow = [black, vermillion, orange, bluishGreen, blue, reddishPurple, 'violet']

df = set4_cds_rpms
pairs = [('undepleted', 'oligo_depletion')]

plot_index=1
num_plots_wide=len(pairs)
num_plots_high=1
fig = plt.figure(figsize=(2.5*num_plots_wide, 2.5*num_plots_high))
plot_index = 1

for pair in pairs:
    plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index)
    df.plot.scatter(pair[0], pair[1], color=black, s=2, alpha=0.5, ax=plot)
    pearson_r, pearson_p = stats.pearsonr(np.log10(df[pair[0]]), np.log10(df[pair[1]]))
    plot.text(5, 15000, 'r2=%.4f (%d)' % (pearson_r**2, len(df[pair[0]])))
    plot.set_ylim(1, 50000)
    plot.set_xlim(1, 50000)
    plot.spines['right'].set_visible(False)
    plot.spines['top'].set_visible(False)
    plot_index+=1
    plot.set_xscale('log')
    plot.set_yscale('log')
    #plot.set_xticks([10**x for x in range(-2, 2)])
    #plot.set_yticks([10**x for x in range(-2, 2)])
    plot.yaxis.set_minor_locator(plt.NullLocator())
    plot.xaxis.set_minor_locator(plt.NullLocator())
    plot.set_aspect('equal', 'box')

#lg=plt.legend(loc=4,prop={'size':12}, labelspacing=0.2)
#lg.draw_frame(False)
plt.tight_layout()
plt.savefig('outputs/figure4D.pdf', transparent='True', format='pdf')

Figure 3, S1, S2: average gene plots by read length

First, I need to plot start codon metaplots for short and long footprints to get the offsets for the undepleted samples, then pool them (keeping short and long FP seperate), and get reading frames

Start Codon Metaplots

In [30]:
import numpy as np
import pandas as pd

def start_meta(experiment, up=200, down=200):
    """
    up, down - how far up or down of start to align
    """
    positions = range(-1*up,down+1)
    frag_lengths = [[f] for f in range(15, 38)]
    dfs = []
    for lib in experiment.libs:
        for frag_length in frag_lengths:
            for end in '5p', '3p':
                normed_count_sum = np.zeros(down + up + 1)
                count_sum = np.zeros(down + up + 1)
                inclusion_sum = np.zeros(down + up + 1)
                assert len(positions) == len(normed_count_sum)
                for transcript in lib.transcripts.values():
                    if True: #could swap lines if there's a group of transcripts in mind
                    #if transcript.tx_id in tx_to_use:
                        #being conservatively broad for normalization
                        cds_reads = transcript.get_cds_read_count(-30, 30, read_end=end, read_lengths=frag_length)
                        if cds_reads>=10:
                            tx_count, tx_inclusion = transcript.get_CDS_read_counts_array(transcript.cds_start, -1*up, down, read_end=end, read_lengths=frag_length)
                            normed_count_sum += tx_count/(float(cds_reads)/transcript.cds_length)
                            count_sum += tx_count
                            inclusion_sum += tx_inclusion
                normed_by_inclusion = normed_count_sum/inclusion_sum
                d = {'fragment length':[frag_length[0]]*len(positions), 'position':positions, 'reads': count_sum, 'normed reads':normed_count_sum, 'included transcripts':inclusion_sum,
                     'inclusion normalized reads':normed_by_inclusion, 'end':[end]*len(positions), 'sample':[lib.lib_settings.sample_name]*len(positions)}
                temp_df = pd.DataFrame(data=d)
                dfs.append(temp_df)         
    return pd.concat(dfs)
In [31]:
set1_start_meta = start_meta(set1_experiment)
set2_start_meta = start_meta(set2_experiment)
set3_start_meta = start_meta(set3_experiment)
set4_start_meta = start_meta(set4_experiment)

set1_start_meta.to_csv('outputs/set1_start_meta.tsv', sep='\t')
set2_start_meta.to_csv('outputs/set2_start_meta.tsv', sep='\t')
set3_start_meta.to_csv('outputs/set3_start_meta.tsv', sep='\t')
set4_start_meta.to_csv('outputs/set4_start_meta.tsv', sep='\t')
/home/boris/.local/lib/python2.7/site-packages/ipykernel_launcher.py:28: RuntimeWarning: invalid value encountered in divide
In [32]:
set1_start_meta = pd.read_csv('outputs/set1_start_meta.tsv', sep='\t')
set2_start_meta = pd.read_csv('outputs/set2_start_meta.tsv', sep='\t')
set3_start_meta = pd.read_csv('outputs/set3_start_meta.tsv', sep='\t')
set4_start_meta = pd.read_csv('outputs/set4_start_meta.tsv', sep='\t')
In [33]:
def plot_meta_heatmap(input_df, sample_names, outname, end='5p', xlabel=None, positions=range(-30,30), frag_lengths=range(20, 38)):
    %matplotlib inline
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns
    #sns.set(style="white", color_codes=True)
    sns.set_style("ticks", {"xtick.major.size": 8, "xtick.minor.size": 4, "ytick.major.size": 8})
    fig = plt.figure(figsize=(8, 2*len(sample_names)))
    num_plots_high=len(sample_names)
    num_plots_wide=1
    plot_index=0
    
    input_df = input_df[input_df['fragment length'].isin(frag_lengths)]
    group_df = input_df.groupby(['sample', 'end'])
    for sample in sample_names:
        sample_df = group_df.get_group((sample, end))
        sample_df = sample_df[sample_df['position'].isin(positions)]
        plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index + 1)
        metaplot = sample_df.pivot("fragment length", "position", "reads")
        sns.heatmap(metaplot, square=True, robust=False, ax=plot, cmap='gray_r')#The robust parameter sets the scale by the data quartiles, not the max and min
        major_xticks=plot.get_xticks()[::3]
        minor_xticks=plot.get_xticks()
        major_tick_labels=plot.get_xticklabels()[::3]
        plot.set_xticklabels(major_tick_labels)
        plot.set_xticks(minor_xticks, minor=True)
        plot.set_xticks(major_xticks, minor=False)
        plot.set_title(sample, fontsize=18)
        if not xlabel==None:
            plot.set_xlabel(xlabel, fontsize=8)
        plot.set_ylabel('fragment length', fontsize=8)
        plot_index+=1
    plt.tight_layout()
    plt.savefig(outname, transparent=True)
In [34]:
outname='outputs/figureS1A.pdf'
plot_meta_heatmap(set1_start_meta, ['1A_noDep', '2A_noDep', '1D_NEB', '2D_NEB'], outname, end='5p', positions=range(-30,30), xlabel=u'fragment 5΄ end relative to first nt of CDS')
In [35]:
outname='outputs/figureS1B.pdf'
plot_meta_heatmap(set2_start_meta, ['undepleted_20_34', 'undepleted_20_60', 'NEBNext_20_34', 'NEBNext_20_60',], outname, end='5p', positions=range(-30,30), xlabel=u'fragment 5΄ end relative to first nt of CDS')
In [36]:
outname='outputs/figureS2.pdf'
plot_meta_heatmap(set3_start_meta, ["undepleted_1","undepleted_2","ribozero_p","ribozero_p_o","ribozero_p_f","ribozero_p_of"], outname, end='5p', positions=range(-30,30), xlabel=u'fragment 5΄ end relative to first nt of CDS')
In [37]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines


def plot_start_meta(df, outname, end = '5p', frag_lengths = range(18,34), ylim=(0,10), xlim=(-50, 100)):
    if frag_lengths == 'all':
        frag_lengths = df['fragment length'].unique()
    fig = plt.figure(figsize=(len(df['sample'].unique())*4, len(frag_lengths)*2))
    plot_index = 1
    plots = []
    group_df = df.groupby(['sample', 'fragment length', 'end'])
    for fragment_length in frag_lengths:
        for sample in df['sample'].unique():
            plot = fig.add_subplot(len(frag_lengths), len(df['sample'].unique()), plot_index)
            sub_df = group_df.get_group((sample, fragment_length, end))
            #df_5p.sort_values(by=['position'], inplace=True)
            sub_df.plot(y='inclusion normalized reads',  x='position', ax=plot, color = ribo_utils.rainbow[0], lw=2)
            plot.set_title('%s %smer' % (sample, str(fragment_length)), y=0.8, loc='left')
            plot.vlines([-12,-13, -14], 0, 10, linestyle='dashed')
            plots.append(plot)
            plot_index += 1

    for plot in plots:
        plot.set_ylim(ylim[0],ylim[1])
        #plot.set_xticks(range(upstream, downstream))
        #major_xticks=plot.get_xticks()[::9]
        #minor_xticks=plot.get_xticks()[::3]
        #major_tick_labels=plot.get_xticklabels()[::6]
        #print plot.get_xticklabels()[::6]
        #plot.set_xticks(minor_xticks, minor=True)
        #plot.set_xticks(major_xticks, minor=False)
        #plot.set_xticklabels(major_tick_labels)
        plot.set_ylabel('mean normalized reads')
        plot.set_xlabel('nt from CDS start')
        # Hide the right and top spines
        plot.spines['right'].set_visible(False)
        plot.spines['top'].set_visible(False)
        plot.set_xlim(xlim[0], xlim[1])
        plot.legend_.remove()

    plt.tight_layout()
    plt.savefig(outname, transparent=True)
In [38]:
plot_start_meta(set1_start_meta, 'outputs/set1_by_length_start.pdf', end = '5p', frag_lengths = range(20,37), ylim=(0,10), xlim=(-20, 0))  
In [39]:
plot_start_meta(set2_start_meta, 'outputs/set2_by_length_start.pdf', end = '5p', frag_lengths = range(20,33), ylim=(0,10), xlim=(-20, 0))  
In [40]:
plot_start_meta(set3_start_meta, 'outputs/set3_by_length_start.pdf', end = '5p', frag_lengths = range(20,33), ylim=(0,10), xlim=(-20, 0))  
In [41]:
plot_start_meta(set4_start_meta, 'outputs/set4_by_length_start.pdf', end = '5p', frag_lengths = range(18,38), ylim=(0,15), xlim=(-20, 0))  
In [42]:
#first need to define the offsets based on the start codon metaplots:
set1_long_offsets = {30:-12, 31:-12, 32:-13, 33:-14, 34:-14, 35:-14}
set2_short_offsets = {20:-12, 21:-12, 22:-13, 23:-13}
set2_long_offsets = {25:-12, 26:-12, 27:-13, 28:-13, 29:-12, 30:-13, 31:-13, 32:-14}
set3_short_offsets = {20:-12, 21:-12, 22:-13, 23:-13}
set3_long_offsets = {25:-12, 26:-12, 27:-13, 28:-13, 29:-12, 30:-13, 31:-13, 32:-14}
set4_short_offsets = {20:-12, 21:-12, 22:-13, 23:-13}
set4_long_offsets = {25:-12, 26:-12, 27:-12, 28:-12, 29:-12, 30:-13, 31:-13, 32:-13}

make joint pooled offset metaplots

In [43]:
import numpy as np
import pandas as pd

def start_meta_pooled_offsets(experiment, offsets_5p, up=100, down=200):
    """
    up, down - how far up or down of start to align
    offsets_5p - a list of offset dictionaries of type {20:-12, 21:-12, 22:-13, 23:-13}, which are the distances of the read 5' end to the first nt of the P site of the
    """
    positions = range(-1*up,down+1)
    dfs = []
    for offset_dict in offsets_5p:
        for lib in experiment.libs:
            for end in '5p', '3p':
                normed_count_sum = np.zeros(down + up + 1)
                count_sum = np.zeros(down + up + 1)
                inclusion_sum = np.zeros(down + up + 1)
                assert len(positions) == len(normed_count_sum)
                frag_lengths = offset_dict.keys()
                for transcript in lib.transcripts.values():
                    #being conservatively broad for normalization
                    cds_reads = transcript.get_cds_read_count(-30, 30, read_end=end, read_lengths=frag_lengths)
                    if cds_reads>=16:
                        tx_count, tx_inclusion = np.zeros(down + up + 1), np.zeros(down + up + 1)
                        for frag_length in frag_lengths:
                            if end == '5p':
                                partial_tx_count, partial_tx_inclusion = transcript.get_CDS_read_counts_array(transcript.cds_start+offset_dict[frag_length], (-1*up), down, read_end=end, read_lengths=[frag_length])
                            if end == '3p':
                                partial_tx_count, partial_tx_inclusion = transcript.get_CDS_read_counts_array(transcript.cds_start+((frag_length+offset_dict[frag_length])-1), (-1*up), down, read_end=end, read_lengths=[frag_length])
                            tx_count += partial_tx_count
                            tx_inclusion += partial_tx_inclusion
                        tx_inclusion = tx_inclusion/len(frag_lengths)
                        normed_count_sum += tx_count/(float(cds_reads)/transcript.cds_length)
                        count_sum += tx_count
                        inclusion_sum += tx_inclusion
                normed_by_inclusion = normed_count_sum/inclusion_sum
                d = {'fragment lengths':[','.join(['%d' % x for x in sorted(frag_lengths)])]*len(positions), 'position':positions, 'reads': count_sum, 'normed reads':normed_count_sum, 'included transcripts':inclusion_sum,
                     'inclusion normalized reads':normed_by_inclusion, 'end':[end]*len(positions), 'sample':[lib.lib_settings.sample_name]*len(positions)}
                temp_df = pd.DataFrame(data=d)
                dfs.append(temp_df)         
    return pd.concat(dfs)
In [44]:
set1_start_meta_pooled = start_meta_pooled_offsets(set1_experiment, [set1_long_offsets])
set2_start_meta_pooled = start_meta_pooled_offsets(set2_experiment, [set2_short_offsets, set2_long_offsets])
set3_start_meta_pooled = start_meta_pooled_offsets(set3_experiment, [set3_short_offsets, set3_long_offsets])
set4_start_meta_pooled = start_meta_pooled_offsets(set4_experiment, [set4_short_offsets, set4_long_offsets])
In [45]:
set1_start_meta_pooled.to_csv('outputs/set1_start_meta_pooled.tsv', sep='\t')
set2_start_meta_pooled.to_csv('outputs/set2_start_meta_pooled.tsv', sep='\t')
set3_start_meta_pooled.to_csv('outputs/set3_start_meta_pooled.tsv', sep='\t')
set4_start_meta_pooled.to_csv('outputs/set4_start_meta_pooled.tsv', sep='\t')
In [46]:
def plot_start_meta_pooled(df, outname, end = '5p', frag_lengths = range(18,34), ylim=(0,10), xlim=(-50, 100)):
    if frag_lengths == 'all':
        frag_lengths = df['fragment lengths'].unique()
    fig = plt.figure(figsize=(len(df['sample'].unique())*4, len(frag_lengths)*2))
    plot_index = 1
    plots = []
    group_df = df.groupby(['sample', 'fragment lengths', 'end'])
    for fragment_length in frag_lengths:
        for sample in df['sample'].unique():
            plot = fig.add_subplot(len(frag_lengths), len(df['sample'].unique()), plot_index)
            sub_df = group_df.get_group((sample, fragment_length, end))
            #df_5p.sort_values(by=['position'], inplace=True)
            sub_df.plot(y='inclusion normalized reads',  x='position', ax=plot, color = ribo_utils.rainbow[0], lw=1.5)
            plot.set_title('%s %smer %d' % (sample, str(fragment_length), sum(sub_df['reads'])), y=0.8, loc='left')
            #plot.vlines([-12,-13, -14], 0, 10, linestyle='dashed')
            plots.append(plot)
            plot_index += 1

    for plot in plots:
        plot.set_ylim(ylim[0],ylim[1])
        #plot.set_xticks(range(upstream, downstream))
        #major_xticks=plot.get_xticks()[::9]
        #minor_xticks=plot.get_xticks()[::3]
        #major_tick_labels=plot.get_xticklabels()[::6]
        #print plot.get_xticklabels()[::6]
        #plot.set_xticks(minor_xticks, minor=True)
        #plot.set_xticks(major_xticks, minor=False)
        #plot.set_xticklabels(major_tick_labels)
        plot.set_ylabel('mean normalized reads')
        plot.set_xlabel('nt from CDS start')
        # Hide the right and top spines
        plot.spines['right'].set_visible(False)
        plot.spines['top'].set_visible(False)
        plot.set_xlim(xlim[0], xlim[1])
        plot.legend_.remove()

    plt.tight_layout()
    plt.savefig(outname, transparent=True)
In [47]:
#note: I overlaid the bottom row of plots in illustrator to make figure 3A
plot_start_meta_pooled(set2_start_meta_pooled, 'outputs/figure3A.pdf', end = '5p', frag_lengths ='all', ylim=(0,8), xlim=(-20, 50))  
In [48]:
plot_start_meta_pooled(set3_start_meta_pooled, 'outputs/figure3B.pdf', end = '5p', frag_lengths = 'all', ylim=(0,8), xlim=(-20, 50))  
In [49]:
#note: I overlaid the rows in illustrator to make figure 4C
plot_start_meta_pooled(set3_start_meta_pooled, 'outputs/figure4C.pdf', end = '5p', frag_lengths = 'all', ylim=(0,12), xlim=(-20, 50))  
In [ ]: